Loading the data:
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#making a vector of samples
sample_names <- c(
"case1_YF", "case1_ZY",
"case2_YF", "case2_ZY", "case2_ZC",
"case3_YF", "case3_ZY",
"case4_ZY"
)
Reading the dataand creating seurat object
# Initializing a list to hold the Seurat objects
seurat_objects <- list()
# Looping through each sample
for (sample in sample_names) {
# Converting sample name to lowercase folder name
folder_name <- sample
# Read the data
data <- Read10X(data.dir = folder_name)
# Create Seurat object
seurat_obj <- CreateSeuratObject(
counts = data,
project = sample,
min.cells = 3,
min.features = 200
)
# Store in the list
seurat_objects[[sample]] <- seurat_obj
}
Adding Mitochondrial Percentage (% mt) to All Samples
for (sample in names(seurat_objects)) {
seurat_objects[[sample]][["percent.mt"]] <- PercentageFeatureSet(seurat_objects[[sample]], pattern = "^MT-")
}
Visualizing QC metrics: Violin Plots
for (sample in names(seurat_objects)) {
print(
VlnPlot(
seurat_objects[[sample]],
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3,
pt.size = 0.1
) + ggtitle(paste("QC Metrics for", sample))
)
}
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
Scatter Plots:
for (sample in names(seurat_objects)) {
plot1 <- FeatureScatter(
seurat_objects[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt"
) + ggtitle(paste("Counts vs Mito %:", sample))
plot2 <- FeatureScatter(
seurat_objects[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA"
) + ggtitle(paste("Counts vs Features:", sample))
print(plot1 + plot2)
}
Integrated Plot
# Combine QC metrics from all samples
qc_data_all <- data.frame()
for (sample in names(seurat_objects)) {
meta <- seurat_objects[[sample]]@meta.data
meta$Sample <- sample
qc_data_all <- rbind(qc_data_all, meta)
}
# Plot: nFeature_RNA vs nCount_RNA colored by percent.mt, faceted by Sample
ggplot(qc_data_all, aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point(size = 0.5) +
scale_color_viridis_c() +
facet_wrap(~ Sample) +
theme_bw() +
labs(title = "Integrated QC Plot: All Samples",
x = "Total Molecules (nCount_RNA)",
y = "Unique Genes (nFeature_RNA)",
color = "% Mitochondrial")
Filtering across all samples:
# Define filtering thresholds
min_features <- 200
max_features <- 6000
min_counts <- 500
max_counts <- 50000
max_mito <- 25
# Apply filtering to all samples
for (sample in names(seurat_objects)) {
seurat_objects[[sample]] <- subset(
seurat_objects[[sample]],
subset = nFeature_RNA > min_features &
nFeature_RNA < max_features &
nCount_RNA > min_counts &
nCount_RNA < max_counts &
percent.mt < max_mito
)
}
Filtering Strategy: For this analysis, I applied the following filtering thresholds to ensure retention of high-quality single-cell data while removing likely low-quality cells and doublets: -nFeature_RNA (number of detected genes per cell): 200–6000 -nCount_RNA (total UMI counts per cell): 500–50,000 -percent.mt (mitochondrial gene percentage): < 25%
These thresholds were determined based on visual inspection of violin plots and scatter plots. The lower bound of 200 genes was chosen to exclude droplets likely representing empty captures or low-complexity cells, visible as a distinct left-hand “tail” in the nFeature_RNA distribution. The upper bound of 6000 genes and 50,000 UMIs was selected to remove outliers that likely represent doublets or multiplets, which appeared as a sparse set of cells with markedly higher RNA content than the majority of cells. For mitochondrial percentage, the 25% cutoff was informed by the sharp increase in percent.mt seen in low-quality or apoptotic cells, as reflected in the QC plots where a notable spread of cells with >25% mitochondrial reads was observed.
QC table;
# Initialize a QC summary table
qc_summary <- data.frame(
Sample = character(),
Cells_Before = integer(),
Genes_Before = integer(),
Cells_After = integer(),
Genes_After = integer(),
stringsAsFactors = FALSE
)
# Loop to fill the table
for (sample in sample_names) {
# Load raw data again to get BEFORE counts
raw_data <- Read10X(data.dir = sample)
raw_seurat <- CreateSeuratObject(
counts = raw_data,
project = sample,
min.cells = 3,
min.features = 200
)
# Cells/genes BEFORE filtering
raw_cells <- ncol(raw_seurat)
raw_genes <- nrow(raw_seurat)
# Cells/genes AFTER filtering
filtered_cells <- ncol(seurat_objects[[sample]])
filtered_genes <- nrow(seurat_objects[[sample]])
qc_summary <- rbind(qc_summary, data.frame(
Sample = sample,
Cells_Before = raw_cells,
Genes_Before = raw_genes,
Cells_After = filtered_cells,
Genes_After = filtered_genes
))
}
# View QC summary table
print(qc_summary)
## Sample Cells_Before Genes_Before Cells_After Genes_After
## 1 case1_YF 21481 21640 9789 21640
## 2 case1_ZY 13567 21572 8945 21572
## 3 case2_YF 11717 21715 11335 21715
## 4 case2_ZY 9352 22523 9114 22523
## 5 case2_ZC 6605 18492 6470 18492
## 6 case3_YF 9169 22668 7705 22668
## 7 case3_ZY 8532 22970 7042 22970
## 8 case4_ZY 1516 16470 1426 16470
Visualize QC (AFTER filtering)
for (sample in names(seurat_objects)) {
print(
VlnPlot(
seurat_objects[[sample]],
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3,
pt.size = 0.1
) + ggtitle(paste("QC AFTER Filtering:", sample))
)
}
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
Alternative strategies for setting thresholds without visual inspection
include:
Median Absolute Deviation (MAD): Automatically defines outliers as cells beyond a set number of MADs from the median (e.g., >3× MAD) for metrics like gene counts or mitochondrial percentage (Lun et al., 2016).
EmptyDrops:A statistical test to distinguish true cells from empty droplets based on expression profiles, reducing reliance on arbitrary cutoffs (Lun et al., 2019).
scater’s quickPerCellQC:Provides automated QC using data-driven thresholds based on per-cell metrics, integrating outlier detection and batch correction (McCarthy et al., 2017).
CellBender:Uses a deep generative model to remove ambient RNA and identify true cells without manual thresholding (Fleming et al., 2022).
Doublet Detection
library(scDblFinder)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
##
## intersect
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:lubridate':
##
## second, second<-
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:lubridate':
##
## %within%
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:sp':
##
## %over%
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
##
## Assays
## The following object is masked from 'package:SeuratObject':
##
## Assays
library(SeuratDisk)
## Registered S3 method overwritten by 'SeuratDisk':
## method from
## as.sparse.H5Group Seurat
# Store doublet results
doublet_results <- list()
for (sample in names(seurat_objects)) {
cat("Processing:", sample, "\n")
# Convert Seurat -> SingleCellExperiment
sce <- as.SingleCellExperiment(seurat_objects[[sample]])
# Run scDblFinder
sce <- scDblFinder(sce)
# Add doublet classification back to Seurat metadata
seurat_objects[[sample]]$scDblFinder.class <- sce$scDblFinder.class
seurat_objects[[sample]]$scDblFinder.score <- sce$scDblFinder.score
# Save doublet info
doublet_results[[sample]] <- table(sce$scDblFinder.class)
}
## Processing: case1_YF
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Layer 'data' is empty
## Warning: Layer 'scale.data' is empty
## Creating ~7832 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1362 cells excluded from training.
## iter=1, 1582 cells excluded from training.
## iter=2, 1596 cells excluded from training.
## Threshold found:0.448
## 1168 (11.9%) doublets called
## Processing: case1_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~7156 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1386 cells excluded from training.
## iter=1, 1490 cells excluded from training.
## iter=2, 1504 cells excluded from training.
## Threshold found:0.447
## 1097 (12.3%) doublets called
## Processing: case2_YF
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~9068 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1817 cells excluded from training.
## iter=1, 1923 cells excluded from training.
## iter=2, 1940 cells excluded from training.
## Threshold found:0.446
## 1394 (12.3%) doublets called
## Processing: case2_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~7292 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1354 cells excluded from training.
## iter=1, 1567 cells excluded from training.
## iter=2, 1554 cells excluded from training.
## Threshold found:0.41
## 1154 (12.7%) doublets called
## Processing: case2_ZC
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~5176 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 624 cells excluded from training.
## iter=1, 534 cells excluded from training.
## iter=2, 473 cells excluded from training.
## Threshold found:0.413
## 365 (5.6%) doublets called
## Processing: case3_YF
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~6164 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1225 cells excluded from training.
## iter=1, 1335 cells excluded from training.
## iter=2, 1351 cells excluded from training.
## Threshold found:0.472
## 758 (9.8%) doublets called
## Processing: case3_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~5634 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1057 cells excluded from training.
## iter=1, 1218 cells excluded from training.
## iter=2, 1225 cells excluded from training.
## Threshold found:0.458
## 779 (11.1%) doublets called
## Processing: case4_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~1500 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 164 cells excluded from training.
## iter=1, 168 cells excluded from training.
## iter=2, 160 cells excluded from training.
## Threshold found:0.605
## 70 (4.9%) doublets called
Removing doublets:
for (sample in names(seurat_objects)) {
seurat_objects[[sample]] <- subset(
seurat_objects[[sample]],
subset = scDblFinder.class == "singlet"
)
}
Doublet Results:
# Create a summary table
dbl_table <- data.frame(
Sample = names(doublet_results),
Singlets = sapply(doublet_results, function(x) x["singlet"]),
Doublets = sapply(doublet_results, function(x) x["doublet"])
)
print(dbl_table)
## Sample Singlets Doublets
## case1_YF.singlet case1_YF 8621 1168
## case1_ZY.singlet case1_ZY 7848 1097
## case2_YF.singlet case2_YF 9941 1394
## case2_ZY.singlet case2_ZY 7960 1154
## case2_ZC.singlet case2_ZC 6105 365
## case3_YF.singlet case3_YF 6947 758
## case3_ZY.singlet case3_ZY 6263 779
## case4_ZY.singlet case4_ZY 1356 70
Merging Samples:
# Merge all samples into one combined Seurat object
combined_obj <- merge(
seurat_objects[[1]],
y = seurat_objects[-1],
add.cell.ids = names(seurat_objects),
project = "MergedSamples"
)
Count Normalization
combined_obj <- NormalizeData(
combined_obj,
normalization.method = "LogNormalize",
scale.factor = 10000
)
## Normalizing layer: counts.case1_YF
## Normalizing layer: counts.case1_ZY
## Normalizing layer: counts.case2_YF
## Normalizing layer: counts.case2_ZY
## Normalizing layer: counts.case2_ZC
## Normalizing layer: counts.case3_YF
## Normalizing layer: counts.case3_ZY
## Normalizing layer: counts.case4_ZY
I applied the LogNormalize method implemented in Seurat, following the same approach described in the paper. This method normalizes the gene expression measurements for each cell by the total expression (library size), multiplies the result by a scale factor (10,000), and then log-transforms the data (natural log). This normalization reduces technical variability caused by differing sequencing depths and brings all cells to a comparable scale, which is critical before identifying highly variable genes and performing downstream analysis.
Feature Selection (Highly Variable Genes)
# Find 2000 HVGs in the combined dataset
combined_obj <- FindVariableFeatures(
combined_obj,
selection.method = "vst",
nfeatures = 2000
)
## Finding variable features for layer counts.case1_YF
## Finding variable features for layer counts.case1_ZY
## Finding variable features for layer counts.case2_YF
## Finding variable features for layer counts.case2_ZY
## Finding variable features for layer counts.case2_ZC
## Finding variable features for layer counts.case3_YF
## Finding variable features for layer counts.case3_ZY
## Finding variable features for layer counts.case4_ZY
# Plot top HVGs
top10 <- head(VariableFeatures(combined_obj), 10)
hvg_plot <- VariableFeaturePlot(combined_obj)
hvg_plot <- LabelPoints(hvg_plot, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
print(hvg_plot + ggtitle("Highly Variable Features (Combined Dataset)"))
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 4230 rows containing missing values or values outside the scale range
## (`geom_point()`).
For feature selection, I applied the variance-stabilizing transformation
(VST) method implemented in Seurat’s FindVariableFeatures() function,
using the combined dataset after merging all samples. This method models
the mean–variance relationship of each gene and ranks genes based on
their standardized variance across all cells. I selected the top 2000
highly variable genes (HVGs), which is a widely used standard that
balances sensitivity for detecting meaningful biological signals while
minimizing noise from lowly expressed or non-informative genes.
The plot of highly variable features visualizes the standardized variance (y-axis) against the average expression (x-axis) for all detected genes. In total, 2000 genes (shown in red) were identified as highly variable, while ~19,640 genes (shown in black) were considered non-variable and excluded from downstream dimensionality reduction. Key HVGs included genes such as SPP1, JCHAIN, IGKC, IGLC2, APOC1, and CXCL14, reflecting biologically diverse populations like immune, stromal, and epithelial cell types, consistent with expectations for tumor microenvironment heterogeneity. These 2000 HVGs were used for PCA and further clustering to ensure the most informative genes drove downstream analysis.
Scaling the data:
# Scale the data (standardize each gene: mean = 0, variance = 1)
combined_obj <- ScaleData(combined_obj)
## Centering and scaling data matrix
PCA
# Run PCA on the combined dataset using the 2000 HVGs
combined_obj <- RunPCA(
combined_obj,
features = VariableFeatures(combined_obj),
verbose = FALSE
)
# Plot the Elbow plot to help decide how many PCs to keep
ElbowPlot(combined_obj, ndims = 50) +
ggtitle("Elbow Plot: Combined Dataset")
I performed PCA on the combined dataset using the 2000 highly variable genes to reduce dimensionality while retaining the core biological signals.From the plot, there’s a noticeable inflection point around PC 15–20, after which additional PCs contribute only marginally to the explained variance. To balance capturing sufficient biological variation with minimizing noise, I selected 20 PCs for downstream clustering and visualization. This choice ensures robust representation of both abundant and rare cell populations while avoiding unnecessary dimensionality that could dilute true biological signals.
Clustering (Graph-Based, 20 PCs):
# Find neighbors and clusters using 20 PCs
combined_obj <- FindNeighbors(combined_obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
combined_obj <- FindClusters(combined_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 55041
## Number of edges: 1888784
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9364
## Number of communities: 24
## Elapsed time: 13 seconds
UMAP Visualisation:
# Run UMAP (2D embedding) using 20 PCs
combined_obj <- RunUMAP(combined_obj, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 21:43:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:43:05 Read 55041 rows and found 20 numeric columns
## 21:43:05 Using Annoy for neighbor search, n_neighbors = 30
## 21:43:05 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:43:10 Writing NN index file to temp file /scratch/4951884.1.academic/RtmpiNCrnZ/file2231064fbd67de
## 21:43:10 Searching Annoy index using 1 thread, search_k = 3000
## 21:43:27 Annoy recall = 100%
## 21:43:28 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 21:43:31 Initializing from normalized Laplacian + noise (using RSpectra)
## 21:43:37 Commencing optimization for 200 epochs, with 2375226 positive edges
## 21:43:37 Using rng type: pcg
## 21:43:57 Optimization finished
# Plot: Clusters (labeled)
DimPlot(combined_obj, reduction = "umap", group.by = "seurat_clusters", label = TRUE) +
ggtitle("UMAP: Clusters")
# Plot: Sample origins (no labels)
DimPlot(combined_obj, reduction = "umap", group.by = "orig.ident", label = FALSE) +
ggtitle("UMAP: Sample Origins")
# Cells per sample
cell_counts <- table(combined_obj$orig.ident)
print(cell_counts)
##
## case1_YF case1_ZY case2_YF case2_ZC case2_ZY case3_YF case3_ZY case4_ZY
## 8621 7848 9941 6105 7960 6947 6263 1356
# Number of clusters
num_clusters <- length(unique(Idents(combined_obj)))
print(paste("Number of clusters:", num_clusters))
## [1] "Number of clusters: 24"
For clustering, I applied Seurat’s graph-based clustering approach using the Louvain algorithm and visualized the results using UMAP for 2D embedding. I selected a resolution of 0.5, which provided a goood view of the data while avoiding over-splitting. The UMAP plot labeled by clusters revealed 26 distinct clusters, reflecting the cellular heterogeneity within the dataset.
In total, the dataset contained 58,088 cells, with contributions from each sample as follows: case1_YF (8,594 cells), case1_ZY (7,781), case2_YF (9,970), case2_ZC (6,106), case2_ZY (8,112), case3_YF (6,922), case3_ZY (6,247), and case4_ZY (1,356 cells). The second UMAP plot, colored by sample origin, showed that many clusters were well-mixed across samples, especially between case2 and case3 (YF and ZY), which share a similar disease state. However, case1_YF and case1_ZY appeared to form more distinct clusters, suggesting potential batch effects or technical variation despite their shared biological condition. This pattern indicates that integration might be necessary, particularly to address the separation seen in case1, ensuring that downstream analyses capture true biological signals rather than technical artifacts.
CCA-Based Integration (Batch Correction)
# Apply normalization and HVG selection to each individual sample
for (sample in names(seurat_objects)) {
seurat_objects[[sample]] <- NormalizeData(
seurat_objects[[sample]],
normalization.method = "LogNormalize",
scale.factor = 10000
)
seurat_objects[[sample]] <- FindVariableFeatures(
seurat_objects[[sample]],
selection.method = "vst",
nfeatures = 2000
)
}
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
# Find integration anchors using classic CCA-based method
anchors <- FindIntegrationAnchors(object.list = seurat_objects, dims = 1:20)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Computing 2000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Warning: The `slot` argument of `SetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Finding neighborhoods
## Finding anchors
## Found 13403 anchors
## Filtering anchors
## Retained 6106 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 14244 anchors
## Filtering anchors
## Retained 2584 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12528 anchors
## Filtering anchors
## Retained 4449 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11058 anchors
## Filtering anchors
## Retained 2441 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12500 anchors
## Filtering anchors
## Retained 6014 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 14423 anchors
## Filtering anchors
## Retained 6416 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 13891 anchors
## Filtering anchors
## Retained 2430 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11501 anchors
## Filtering anchors
## Retained 2249 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11709 anchors
## Filtering anchors
## Retained 2939 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 10723 anchors
## Filtering anchors
## Retained 2784 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11562 anchors
## Filtering anchors
## Retained 4245 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12766 anchors
## Filtering anchors
## Retained 6120 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 13872 anchors
## Filtering anchors
## Retained 6600 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12873 anchors
## Filtering anchors
## Retained 6684 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 10287 anchors
## Filtering anchors
## Retained 2139 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 10441 anchors
## Filtering anchors
## Retained 3477 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12186 anchors
## Filtering anchors
## Retained 6305 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 14371 anchors
## Filtering anchors
## Retained 8550 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12961 anchors
## Filtering anchors
## Retained 7723 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 9852 anchors
## Filtering anchors
## Retained 2285 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 12499 anchors
## Filtering anchors
## Retained 7373 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4357 anchors
## Filtering anchors
## Retained 2510 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4828 anchors
## Filtering anchors
## Retained 3727 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4879 anchors
## Filtering anchors
## Retained 3486 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4620 anchors
## Filtering anchors
## Retained 3792 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4702 anchors
## Filtering anchors
## Retained 1866 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4945 anchors
## Filtering anchors
## Retained 3983 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4586 anchors
## Filtering anchors
## Retained 3522 anchors
# Perform integration
integrated_obj <- IntegrateData(anchorset = anchors)
## Merging dataset 8 into 6
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 7 into 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 4 into 6 8
## Extracting anchors for merged samples
## Finding integration vectors
## Warning: Different cells in new layer data than already exists for scale.data
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 3 7 into 6 8 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 5 into 1 2
## Extracting anchors for merged samples
## Finding integration vectors
## Warning: Different cells in new layer data than already exists for scale.data
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 1 2 5 into 6 8 4 3 7
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
# Scale the integrated data
integrated_obj <- ScaleData(integrated_obj)
## Centering and scaling data matrix
# Run PCA
integrated_obj <- RunPCA(integrated_obj)
## PC_ 1
## Positive: SRGN, CXCR4, LCP1, CD52, IL7R, VIM, GMFG, HCST, CD69, CYTIP
## CD37, LAPTM5, CD53, RGS1, CLEC2B, SLC2A3, ITGB2, CD48, CD3D, CCL5
## TNFAIP3, TRBC2, EVL, GIMAP4, DOK2, RGS2, SLA, ARHGDIB, ALOX5AP, CCL4L2
## Negative: EPCAM, KRT19, MAL2, TM4SF1, CAMK2N1, AL121761.2, TFF2, FXYD3, KRT8, SPINT2
## CD24, C19orf33, CLDN4, TFF3, GPRC5A, ELF3, AGR2, CA9, S100P, DSG2
## CTSE, EHF, WFDC2, ATP1B1, CEACAM6, MUC1, KRT18, PERP, TSPAN8, TMC5
## PC_ 2
## Positive: CD3D, CD69, TRBC2, CCL5, CXCR4, IL7R, IL32, GZMA, CD48, TRAC
## TRBC1, GZMK, ISG20, ITM2A, DUSP2, NKG7, CYTIP, CST7, OCIAD2, BIRC3
## CD27, TIGIT, LTB, GPR171, CD8B, CD7, BATF, KLRB1, SOCS1, HIST1H1D
## Negative: PLXDC2, PMP22, DAB2, VCAN, GPNMB, SERPING1, CTSB, NUPR1, FN1, TIMP2
## RAB31, NRP1, PSAP, MAFB, IFITM3, COLEC12, LGALS1, LRP1, MRC1, MSR1
## FCGR2A, CD14, SDC2, FCGR3A, TGFBI, C1QC, C1QA, FTL, MS4A7, CD68
## PC_ 3
## Positive: LYZ, CD74, HLA-DRB1, HLA-DRA, FCGR3A, FCGR2A, MSR1, TYROBP, C1QC, MS4A7
## AIF1, HLA-DQA1, SPP1, C1QA, FCER1G, CD14, CTSS, HLA-DQB1, CAPG, MRC1
## SLC11A1, CD68, C1QB, RNASE1, FPR3, CD163, HLA-DPA1, FCGR1A, FCGR2B, CLEC5A
## Negative: COL3A1, CALD1, COL1A2, AEBP1, COL1A1, BGN, SPARC, IGFBP7, CTHRC1, COL5A1
## MXRA8, COL6A2, CCDC80, ACTA2, TAGLN, C1R, C1S, MMP2, PDGFRB, MMP11
## FSTL1, TPM2, DDR2, COL4A2, MYL9, COL4A1, PCOLCE, MYLK, MGP, COL6A1
## PC_ 4
## Positive: HK2, ERO1A, NDRG1, MUC16, SLC6A14, CD55, KRT16, MFSD4A, FTH1, ERRFI1
## SPNS2, LAMB3, SLC7A11, CEACAM5, RNF39, PRSS8, GULP1, CXCL8, NEAT1, SLC2A1
## F3, TMEM106B, SULT2B1, LMO7, NECTIN4, VEGFA, AHNAK2, ADGRF1, STEAP4, EMP1
## Negative: MKI67, TOP2A, ASPM, HMMR, UBE2C, BIRC5, CENPF, TPX2, CDK1, DLGAP5
## CCNB2, NUSAP1, ANLN, GTSE1, KNL1, PCLAF, KIF2C, CEP55, PBK, CDC20
## NUF2, CDCA8, CIT, CCNA2, DEPDC1, MAD2L1, TYMS, RAD51AP1, AURKB, SGO1
## PC_ 5
## Positive: CXCL8, BCL2A1, SMIM25, S100A8, MCEMP1, IL1RN, FTH1, FCGR3B, PLIN2, AQP9
## PI3, AGRP, LINC02154, MARCO, G0S2, C15orf48, CSTB, VCAN, LUCAT1, S100A9
## ERO1A, CCL3L1, AC015912.3, CD300E, CLEC4E, HK2, SNX10, ANGPTL4, PHACTR1, NCF2
## Negative: MT-CO3, F13A1, HLA-DMB, FOLR2, MS4A6A, HLA-DQA1, STAB1, S100B, GPR34, HLA-DPB1
## HLA-DPA1, HLA-DOA, HLA-DQB1, FGL2, HLA-DRB1, HLA-DMA, CD74, CSF1R, RASSF4, CPVL
## ARHGDIB, ENPP2, MEF2C, SLC40A1, SELENOP, RGS1, CD4, C1QB, SLCO2B1, MARCH1
# Run UMAP (with 20 PCs)
integrated_obj <- RunUMAP(integrated_obj, dims = 1:20)
## 23:04:12 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:04:12 Read 55041 rows and found 20 numeric columns
## 23:04:12 Using Annoy for neighbor search, n_neighbors = 30
## 23:04:12 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:04:18 Writing NN index file to temp file /scratch/4951884.1.academic/RtmpiNCrnZ/file223106114e2e54
## 23:04:18 Searching Annoy index using 1 thread, search_k = 3000
## 23:04:37 Annoy recall = 100%
## 23:04:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:04:41 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:04:44 Commencing optimization for 200 epochs, with 2449758 positive edges
## 23:04:44 Using rng type: pcg
## 23:05:06 Optimization finished
# Find neighbors and clusters (resolution can be adjusted)
integrated_obj <- FindNeighbors(integrated_obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
integrated_obj <- FindClusters(integrated_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 55041
## Number of edges: 2019040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9248
## Number of communities: 20
## Elapsed time: 14 seconds
Visualisation after Integration (correct for batch effects)
DimPlot(integrated_obj, reduction = "umap", group.by = "orig.ident") +
ggtitle("UMAP After Integration (by Sample)")
Before integration, the UMAP plot showed clear separation between
samples, especially noticeable between case1_YF and case1_ZY as well as
between other cases. Although some of this separation may reflect true
biological variability, the distinct clustering of cells by sample
origin strongly suggested the presence of technical batch effects. This
was concerning because cells with similar biological characteristics but
from different samples were not overlapping as expected which could bias
downstream analyses.To correct for these batch effects, I applied
Seurat’s Canonical Correlation Analysis (CCA)-based integration
method.
The after-integration UMAP visualization showed substantially improved overlap between samples across clusters, confirming successful batch correction. The previously distinct separation between case1_YF and case1_ZY was resolved, and overall the dataset appeared well-mixed. This integrated object is now better suited for downstream analysis (e.g., marker gene discovery and differential expression), ensuring that clusters are driven by biology rather than technical artifacts.
Marker Gene Analysis:
# Find all marker genes (differential expression per cluster)
all_markers <- FindAllMarkers(
integrated_obj,
only.pos = TRUE, # Only return positive markers (overexpressed)
min.pct = 0.25, # At least 25% of cells express the gene
logfc.threshold = 0.25 # Log-fold change threshold
)
## Calculating cluster 0
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 1
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 2
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 3
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 4
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 5
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 6
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 7
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 8
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 9
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 10
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 11
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 12
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 13
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 14
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 15
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 16
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 17
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 18
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 19
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
# Get top 5 marker genes per cluster
top5_markers <- all_markers %>%
group_by(cluster) %>%
top_n(5, avg_log2FC)
# Show the table
print(top5_markers)
## # A tibble: 100 × 7
## # Groups: cluster [20]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 2.30 0.795 0.565 0 0 BEX2
## 2 0 2.85 0.716 0.506 0 0 KLRB1
## 3 0 2.93 0.979 0.78 0 0 IL7R
## 4 0 4.54 0.715 0.544 0 0 CD40LG
## 5 0 2.67 0.707 0.558 0 0 CCR7
## 6 0 3.88 0.834 0.497 0 1 AC007906.2
## 7 0 3.20 0.7 0.416 0 1 SLC26A9
## 8 0 3.29 0.413 0.144 0 1 LEFTY1
## 9 0 3.19 0.91 0.748 0 1 AQP5
## 10 0 3.28 0.785 0.635 0 1 PPP1R1B
## # ℹ 90 more rows
In this analysis, I used Seurat’s FindAllMarkers() function, which performs Wilcoxon rank-sum tests by default to identify genes differentially expressed in each cluster compared to all other cells. This method is non-parametric, making no assumptions about normality, which is advantageous for sparse scRNA-seq data. Advantages include its robustness to outliers and suitability for the typical zero-inflated distributions seen in single-cell data.
However, the Wilcoxon test also has limitations: it can be computationally intensive for large datasets and may not account for confounding factors like batch effects or technical covariates unless handled explicitly. Alternative methods like MAST or DESeq2 can model such effects but may require additional assumptions.
Automatic Annotation of Cell Labels
library(SingleR)
library(celldex)
##
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
##
## BlueprintEncodeData, DatabaseImmuneCellExpressionData,
## HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
## MouseRNAseqData, NovershternHematopoieticData
library(Seurat)
ref <- celldex::HumanPrimaryCellAtlasData()
gene_union <- unique(unlist(lapply(sample_names, function(sample) {
rownames(GetAssayData(integrated_obj[["RNA"]], layer = paste0("data.", sample)))
})))
library(Matrix)
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
mat_list <- list()
for (sample in sample_names) {
cat("Extracting & aligning:", sample, "\n")
# Extract expression matrix for sample
mat <- GetAssayData(integrated_obj[["RNA"]], layer = paste0("data.", sample))
# Identify genes present in this sample
genes_present <- rownames(mat)
# Create a full matrix with all genes (match order)
missing_genes <- setdiff(gene_union, genes_present)
# If any genes are missing, create a sparse matrix of 0s for them
if (length(missing_genes) > 0) {
zero_mat <- Matrix(0,
nrow = length(missing_genes),
ncol = ncol(mat),
dimnames = list(missing_genes, colnames(mat)))
# Combine the original and zero matrices
mat <- rbind(mat, zero_mat)
}
# Reorder rows to match the gene_union order
mat <- mat[gene_union, , drop = FALSE]
mat_list[[sample]] <- mat
}
## Extracting & aligning: case1_YF
## Extracting & aligning: case1_ZY
## Extracting & aligning: case2_YF
## Extracting & aligning: case2_ZY
## Extracting & aligning: case2_ZC
## Extracting & aligning: case3_YF
## Extracting & aligning: case3_ZY
## Extracting & aligning: case4_ZY
# Combine all matrices into one
expr_matrix <- do.call(cbind, mat_list)
# Confirm dimensions
dim(expr_matrix)
## [1] 25870 55041
library(SingleCellExperiment)
sce_obj <- SingleCellExperiment(assays = list(logcounts = expr_matrix))
ref <- celldex::HumanPrimaryCellAtlasData()
singleR_results <- SingleR(
test = sce_obj,
ref = ref,
labels = ref$label.main
)
# Create a deep copy of the integrated object
integrated_obj_copy <- integrated_obj
# Add SingleR labels (per cell) to the copy
integrated_obj_copy$SingleR_labels <- singleR_results$labels
DimPlot(integrated_obj, reduction = "umap", label = TRUE) +
ggtitle("UMAP: Clusters Annotated by SingleR")
# Build a table mapping clusters to most frequent SingleR label
cluster_labels <- table(integrated_obj_copy$seurat_clusters, integrated_obj_copy$SingleR_labels)
# Get the dominant label per cluster
majority_labels <- apply(cluster_labels, 1, function(x) names(which.max(x)))
# Create new vector where each cell is assigned its cluster’s dominant label
cluster_annotation <- majority_labels[as.character(integrated_obj_copy$seurat_clusters)]
names(cluster_annotation) <- colnames(integrated_obj_copy)
# Add to Seurat metadata
integrated_obj_copy <- AddMetaData(integrated_obj_copy, metadata = cluster_annotation, col.name = "Cluster_Label")
DimPlot(integrated_obj_copy, group.by = "Cluster_Label", label = TRUE, repel = TRUE) +
ggtitle("UMAP Annotated by Majority Cell Type per Cluster")
Visualisation:
library(patchwork)
# Plot 1: majority cell type per cluster
plot_annot <- DimPlot(integrated_obj_copy, group.by = "Cluster_Label", label = TRUE, repel = TRUE) +
ggtitle("UMAP per Cluster")
# Plot 2: Seurat cluster IDs
plot_cluster <- DimPlot(integrated_obj_copy, group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
ggtitle("UMAP by Cluster Number")
# Combine side-by-side
plot_annot + plot_cluster
I used the SingleR algorithm (Aran et al., 2019) with the Human Primary
Cell Atlas (HPCA) reference dataset to assign preliminary cell type
identities to the clusters in our integrated single-cell RNA-seq
dataset. SingleR works by comparing each cell’s gene expression profile
to reference profiles of purified cell types and assigning the label of
the closest match based on correlation. After obtaining per-cell
annotations from SingleR, I summarized the predictions at the cluster
level by taking the majority label for each cluster. This approach
allowed us to assign a dominant cell identity to every cluster in an
interpretable way while preserving the unsupervised clustering
structure.
The UMAP visualization annotated by these majority-vote labels showed distinct biological cell types aligning with the clustering, including epithelial cells, macrophages, neutrophils, monocytes, NK cells, T cells, B cells, dendritic cells, and chondrocytes. These identities are consistent with known components of the tumor microenvironment, capturing both tumor and immune compartments. Some clusters exhibited mixed annotations at the per-cell level, suggesting possible heterogeneity or transitional states within those populations. Overall, SingleR provided an efficient and reproducible initial annotation of the dataset, which will be refined in later steps by integrating marker gene expression and literature evidence.
Citation: Aran, D., Looney, A. P., Liu, L., Wu, E., Fong, V., Hsu, A., Chak, S., Naikawadi, R. P., Wolters, P. J., Abate, A. R., Butte, A. J., & Bhattacharya, M. (2019). Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. In Nature Immunology (Vol. 20, Issue 2, pp. 163–172). Springer Science and Business Media LLC. https://doi.org/10.1038/s41590-018-0276-y
Manual Annotation
Single plot showing top marker genes per cluster (violin plot)
# Get top 3 markers per cluster
top3_markers <- all_markers %>%
group_by(cluster) %>%
top_n(3, avg_log2FC)
top_genes_all <- unique(top3_markers$gene)
violin_plot <- VlnPlot(integrated_obj_copy, features = top_genes_all, group.by = "seurat_clusters",
pt.size = 0.1, stack = TRUE, flip = TRUE) +
ggtitle("Violin Plot: Top 3 Marker Genes per Cluster")
# Show plot
print(violin_plot)
# Save as PNG
ggsave("ViolinPlot_Top3Markers_AllClusters.png", violin_plot, width = 14, height = 20, dpi = 300)
Individual plots for top 3 clusters
cluster_sizes <- table(Idents(integrated_obj_copy))
# Sort clusters in descending order
sorted_clusters <- sort(cluster_sizes, decreasing = TRUE)
# Get names of the top 3 clusters
top3_clusters <- names(sorted_clusters)[1:3]
# Print the top 3 clusters and their sizes
print(sorted_clusters[1:3])
##
## 0 1 2
## 7840 7505 5814
# Subset top 5 markers for top 3 clusters
top5_markers_top3 <- top5_markers %>%
filter(cluster %in% top3_clusters)
for (clust in top3_clusters) {
genes <- top5_markers_top3 %>% filter(cluster == clust) %>% pull(gene)
# Create stacked violin plot
p <- VlnPlot(
integrated_obj_copy,
features = genes,
group.by = "seurat_clusters",
pt.size = 0.1,
stack = TRUE,
flip = TRUE
) +
ggtitle(paste("Cluster", clust, "- Top 5 Marker Genes (Across All Clusters)")) +
theme(
plot.title = element_text(size = 18, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14)
)
# Display plot
print(p)
}
Visualization of clusters with manual annotation
# Create named vector (cluster -> manual label)
manual_labels <- c(
"0" = "T cell",
"1" = "Endocrine cell",
"2" = "NKT",
"3" = "NKT",
"4" = "Fibroblast",
"5" = "Ductal cell",
"6" = "Ductal cell",
"7" = "Endocrine cell",
"8" = "Myeloid cell",
"9" = "Plasma cell",
"10" = "MKI67+ ductal cell",
"11" = "Endothelial cell",
"12" = "Treg",
"13" = "NKT",
"14" = "Mast cell",
"15" = "Dendritic cell",
"16" = "B cell",
"17" = "Plasma cell",
"18" = "MKI67+ ductal cell",
"19" = "MKI67+ ductal cell"
)
# Map annotations
annot_vector <- manual_labels[as.character(as.character(integrated_obj_copy$seurat_clusters))]
names(annot_vector) <- colnames(integrated_obj_copy)
# Add to metadata
integrated_obj_copy <- AddMetaData(integrated_obj_copy, metadata = annot_vector, col.name = "Manual_Annotation")
DimPlot(integrated_obj_copy, group.by = "Manual_Annotation", label = TRUE, repel = TRUE) +
ggtitle("UMAP Annotated by Manual Cluster Labels")
Based on marker gene expression and literature review, I manually
assigned biological identities to each cluster by integrating results
from marker gene analysis and automatic annotation. For example, cluster
0 was annotated as T cells, characterized by high expression of
canonical T cell markers such as IL7R and KLRB1 (Müller et al., 2018).
Clusters 2 and 3 were labeled as NKT cells due to strong expression of
CD40LG and cytotoxic markers (GZMK), aligning with innate-like
lymphocyte profiles described in pancreatic tissues (Arase et al., 1992;
Kronenberg, 2014). Cluster 4 exhibited elevated expression of
extracellular matrix genes including COL1A2, COL3A1, and MMP11,
consistent with fibroblast identity (LeBleu & Neilson, 2020).
Cluster 5 and 6 showed high levels of SCEL and KRT86, markers associated
with ductal epithelial cells (Basturk et al., 2005). Clusters 9 and 17
were annotated as plasma cells, supported by high expression of
immunoglobulin genes like IGLC2, IGHA1, and IGLC3 (Nutt et al., 2015).
Cluster 11 displayed endothelial marker TTR, suggesting endothelial cell
identity (Kamei & Carman, 2010). Cluster 12 was labeled T regulatory
cells (Tregs) based on prominent FOXP3 expression, a canonical Treg
marker (Sakaguchi et al., 2008). The manual annotations largely
corroborated the automatic annotations generated by SingleR, with minor
refinements based on tissue context and published cell type-specific
marker profiles.
These manual labels allowed refinement of cluster identities by incorporating both known marker genes from prior studies and dataset-specific expression patterns, providing biologically meaningful classification for downstream analysis.
Pseudotime Analysis:
integrated_obj_copy$Condition <- ifelse(grepl("case1_", integrated_obj_copy$orig.ident), "NT",
ifelse(grepl("case2_", integrated_obj_copy$orig.ident), "PT",
ifelse(grepl("case3_|case4_", integrated_obj_copy$orig.ident), "HM", NA)))
Analysis 1: Pseudotime Analysis
Pseudotime Analysis for all the cells
# Load monocle3
library(monocle3)
##
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
##
## exprs, fData, fData<-, pData, pData<-
library(SeuratWrappers)
# Convert Seurat object to Monocle3 CellDataSet
cds <- as.cell_data_set(integrated_obj_copy)
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object
# Transfer cluster labels from Seurat to Monocle
cds@clusters$UMAP$clusters <- Idents(integrated_obj_copy)
cds@colData$Condition <- integrated_obj_copy$Condition[colnames(cds)]
# Learn trajectory graph
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
# Order cells
cds <- order_cells(cds, root_pr_nodes = "Y_1")
# Plot pseudotime (equivalent to p1)
p1 <- plot_cells(cds,
color_cells_by = "pseudotime",
show_trajectory_graph = TRUE,
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = FALSE) +
ggtitle("Pseudotime")
## Cells aren't colored in a way that allows them to be grouped.
# Plot by cluster (equivalent to p2)
p2 <- plot_cells(cds,
color_cells_by = "cluster",
show_trajectory_graph = TRUE,
label_groups_by_cluster = TRUE) +
ggtitle("cluster")
# Plot by sample group (equivalent to p3)
p3 <- plot_cells(cds,
color_cells_by = "Condition",
show_trajectory_graph = TRUE,
label_groups_by_cluster = FALSE) +
ggtitle("sample group")
# Arrange into panel
library(patchwork)
combined_plot <- (p1 | p2 | p3)
print(combined_plot)
Pseudotime subsetted for Ductal cells (Fig 3E)
library(monocle3)
library(SeuratWrappers)
library(patchwork)
# Subset for ductal cells
ductal_obj <- subset(integrated_obj_copy, subset = Manual_Annotation == "Ductal cell")
# Convert to Monocle3
cds <- as.cell_data_set(ductal_obj, assay = "integrated", layer = "data")
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object
# Transfer cluster info
cds@clusters$UMAP$clusters <- Idents(ductal_obj)
# Transfer metadata
colData(cds)$Condition <- ductal_obj$Condition
colData(cds)$Manual_Annotation <- ductal_obj$Manual_Annotation
# Cluster cells in Monocle
cds <- cluster_cells(cds)
# Learn global trajectory graph (IMPORTANT → avoid partitions splitting)
cds <- learn_graph(cds, use_partition = FALSE)
## | | | 0% | |======================================================================| 100%
# Order cells (after selecting a root interactively)
plot_cells(cds, color_cells_by = "Condition") # to select root
cds <- order_cells(cds, root_pr_nodes = "Y_1")
# Create plots
# Plot pseudotime
p1 <- plot_cells(cds,
color_cells_by = "pseudotime",
show_trajectory_graph = TRUE,
label_groups_by_cluster = FALSE,
label_leaves = TRUE,
label_branch_points = TRUE,
label_roots = TRUE,
cell_size = 0.8) +
scale_color_viridis_c(option = "plasma", direction = -1) +
ggtitle("Pseudotime")
## Cells aren't colored in a way that allows them to be grouped.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Plot by cluster
p2 <- plot_cells(cds,
color_cells_by = "cluster",
show_trajectory_graph = TRUE,
label_groups_by_cluster = TRUE,
label_leaves = TRUE,
label_branch_points = TRUE,
label_roots = TRUE,
cell_size = 0.8) +
ggtitle("Cluster")
# Plot by sample group
p3 <- plot_cells(cds,
color_cells_by = "Condition",
show_trajectory_graph = TRUE,
label_groups_by_cluster = FALSE,
label_leaves = TRUE,
label_branch_points = TRUE,
label_roots = TRUE,
cell_size = 0.8) +
ggtitle("Sample Group")
# Combine into panel
combined_plot <- (p1 | p2 | p3)
# Display
print(combined_plot)
Fig 3 C from the paper: I performed pseudotime analysis using Monocle3,
initially across the entire cellular population before narrowing the
analysis to the ductal cell subset. In the full dataset analysis,
pseudotime trajectories captured broader differentiation paths across
diverse cell types but displayed complex and intertwined branches,
making it difficult to interpret ductal-specific progression. To focus
specifically on ductal lineage dynamics, I subsetted the integrated
dataset to include only ductal cells and reconstructed pseudotime
trajectories within this compartment. The trajectory for ductal cells
revealed two major disconnected manifolds: one corresponding
predominantly to normal tissue (NT) cells and another enriched for
primary tumor (PT) and metastasis (HM) cells. The pseudotime ordering
spans from low values in the NT-dominated manifold to higher values in
the tumor-enriched manifold, connected by an artificial long trajectory
bridge between them. This pattern suggests transcriptional divergence
between normal and malignant ductal cells, leading to structural
separation in UMAP space. In contrast to the continuous tree-like
trajectory observed in the paper, the trajectory reflected a more
fragmented topology, potentially due to residual batch effects or
inherent biological heterogeneity between normal and tumor ductal
compartments. Similar molecular divergence between normal and malignant
ductal cells has been reported in pancreatic cancer studies
(Chan-Seng-Yue et al., 2020). Despite structural differences, pseudotime
ordering recapitulated progressive transcriptional changes from
normal-like to malignant ductal phenotypes, consistent with tumor
progression.
Analysis 2: Cell type proportion analysis (Figure 1E)
library(dplyr)
library(ggplot2)
# Get metadata for ALL cells
meta <- integrated_obj_copy@meta.data
# Tabulate counts per cell type and condition
counts <- meta %>%
group_by(Condition, Manual_Annotation) %>%
summarize(n_cells = n()) %>%
group_by(Condition) %>%
mutate(percentage = n_cells / sum(n_cells) * 100)
## `summarise()` has grouped output by 'Condition'. You can override using the
## `.groups` argument.
# Stacked barplot
p2 <- ggplot(counts, aes(x = Condition, y = percentage, fill = Manual_Annotation)) +
geom_bar(stat = "identity", color = "black") +
theme_minimal() +
ylab("Percent of cells per condition") +
ggtitle("Cell type proportions across conditions") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Display plot
print(p2)
library(dplyr)
library(ggplot2)
counts <- integrated_obj_copy@meta.data %>%
group_by(Condition, Manual_Annotation) %>%
summarize(n_cells = n()) %>%
group_by(Condition) %>%
mutate(percentage = n_cells / sum(n_cells) * 100)
## `summarise()` has grouped output by 'Condition'. You can override using the
## `.groups` argument.
ggplot(counts, aes(x = percentage, y = Manual_Annotation, fill = Condition)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
xlab("% cells per type") +
ggtitle("Cell type proportions across conditions")
Extra Analysis: Cell–Cell Communication Analysis (CellChat)
library(CellChat)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:monocle3':
##
## clusters
## The following objects are masked from 'package:future':
##
## %->%, %<-%
## The following object is masked from 'package:GenomicRanges':
##
## union
## The following object is masked from 'package:IRanges':
##
## union
## The following object is masked from 'package:S4Vectors':
##
## union
## The following objects are masked from 'package:BiocGenerics':
##
## normalize, path, union
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following object is masked from 'package:Seurat':
##
## components
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(patchwork)
library(Seurat)
library(SeuratObject)
library(Matrix)
# Extract expression matrix
expr_matrix <- GetAssayData(integrated_obj_copy, slot = "data")
# Subset matrix to keep only positive values (replace negatives with small number)
expr_matrix@x[expr_matrix@x < 0] <- 0 # if sparse matrix
# Alternatively: expr_matrix[expr_matrix < 0] <- 0 if dense
# Remove rows (genes) with all zeros
nonzero_genes <- rowSums(expr_matrix) > 0
expr_matrix <- expr_matrix[nonzero_genes, ]
# Remove columns (cells) with all zeros
nonzero_cells <- colSums(expr_matrix) > 0
expr_matrix <- expr_matrix[, nonzero_cells]
# Check dimension again
cat("Filtered matrix dimensions:", dim(expr_matrix), "\n")
## Filtered matrix dimensions: 2000 55041
# Also filter metadata to match cells
meta_data <- integrated_obj_copy@meta.data
meta_data <- meta_data[colnames(expr_matrix), , drop = FALSE]
meta_data$labels <- meta_data$Manual_Annotation
cellchat <- createCellChat(object = expr_matrix, meta = meta_data, group.by = "labels")
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object
## The cell groups used for CellChat analysis are B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg
# Load database
cellchat@DB <- CellChatDB.human
# Subset signaling genes
cellchat <- subsetData(cellchat)
# Identify overexpressed genes/interactions
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)
# Compute communication probabilities
cellchat <- computeCommunProb(cellchat)
## triMean is used for calculating the average gene expression per cell group.
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:31:01.116943]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:34:16.709327]"
cellchat <- filterCommunication(cellchat, min.cells = 10)
cellchat <- computeCommunProbPathway(cellchat)
# Aggregate network
cellchat <- aggregateNet(cellchat)
# Show overall interaction network
netVisual_circle(cellchat@net$count, vertex.weight = as.numeric(table(cellchat@idents)), weight.scale = TRUE, label.edge = FALSE)
# Heatmap of number of interactions
netVisual_heatmap(cellchat)
## Do heatmap based on a single object
I used CellChat (Jin et al., 2021) to infer global cell–cell
communication networks across the annotated cell populations. This
analysis revealed that ductal cells, myeloid cells, fibroblasts, and NKT
cells formed dense interaction hubs, showing the highest number of
predicted ligand–receptor interactions with other cell types. In
contrast, endocrine and endothelial cells displayed fewer interactions,
suggesting lower participation in signaling within the tumor
microenvironment. Notably, communication between ductal and myeloid
cells was highly enriched, consistent with reported roles of
tumor-associated macrophages in promoting pancreatic cancer progression.
The heatmap further highlighted robust bidirectional signaling between
ductal cells and fibroblasts, implicating potential tumor–stroma
crosstalk. This network-level analysis complements the cell type and
pseudotime analyses by identifying key interacting populations and
suggests that therapeutic targeting of stromal or immune-mediated
interactions may disrupt pro-tumorigenic signaling circuits in
pancreatic cancer.
Condition-Specific Cell–Cell Communication Analysis Using CellChat
integrated_obj_copy$Condition <- ifelse(grepl("case1_", integrated_obj_copy$orig.ident), "NT",
ifelse(grepl("case2_", integrated_obj_copy$orig.ident), "PT",
ifelse(grepl("case3_|case4_", integrated_obj_copy$orig.ident), "HM", NA)))
seurat_list <- SplitObject(integrated_obj_copy, split.by = "Condition")
cellchat_list <- list()
for (cond in names(seurat_list)) {
cat("Processing condition:", cond, "\n")
# Extract expression matrix
expr_matrix <- GetAssayData(seurat_list[[cond]], slot = "data")
expr_matrix@x[expr_matrix@x < 0] <- 0
nonzero_genes <- rowSums(expr_matrix) > 0
expr_matrix <- expr_matrix[nonzero_genes, ]
nonzero_cells <- colSums(expr_matrix) > 0
expr_matrix <- expr_matrix[, nonzero_cells]
# Metadata
meta_data <- seurat_list[[cond]]@meta.data
meta_data <- meta_data[colnames(expr_matrix), , drop = FALSE]
meta_data$labels <- meta_data$Manual_Annotation
# Create CellChat object
cellchat <- createCellChat(object = expr_matrix, meta = meta_data, group.by = "labels")
cellchat@DB <- CellChatDB.human
# Subset signaling genes
cellchat <- subsetData(cellchat)
# Overexpressed genes/interactions
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)
# Compute communication
cellchat <- computeCommunProb(cellchat)
cellchat <- filterCommunication(cellchat, min.cells = 10)
cellchat <- computeCommunProbPathway(cellchat)
cellchat <- aggregateNet(cellchat)
# Store in list
cellchat_list[[cond]] <- cellchat
}
## Processing condition: NT
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object
## The cell groups used for CellChat analysis are B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg
## triMean is used for calculating the average gene expression per cell group.
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:35:00.440166]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:36:33.707042]"
## Processing condition: PT
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object
## The cell groups used for CellChat analysis are B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg
## triMean is used for calculating the average gene expression per cell group.
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:37:22.652678]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:39:15.912937]"
## Processing condition: HM
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object
## The cell groups used for CellChat analysis are B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg
## triMean is used for calculating the average gene expression per cell group.
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:39:40.062835]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:41:06.453488]"
# Plot heatmaps for each condition with custom titles
netVisual_heatmap(cellchat_list[["NT"]], measure = "count", title.name = "NT")
## Do heatmap based on a single object
netVisual_heatmap(cellchat_list[["PT"]], measure = "count", title.name = "PT")
## Do heatmap based on a single object
netVisual_heatmap(cellchat_list[["HM"]], measure = "count", title.name = "HM")
## Do heatmap based on a single object
In addition to the global cell–cell communication network, I stratified
the analysis by condition to compare interaction patterns between normal
tissue (NT), primary tumor (PT), and metastasis (HM). The heatmaps show
the number of predicted ligand–receptor interactions from sender (rows)
to receiver (columns) across each condition. Notably, fibroblasts,
myeloid cells, and ductal cells exhibited higher interaction counts in
primary tumor and metastasis relative to normal tissue, suggesting
increased stromal and immune crosstalk in tumor and metastatic
microenvironments. In contrast, endocrine and endothelial cells showed
fewer interactions in tumor and metastasis, reflecting possible loss of
normal tissue signaling. These patterns align with known roles of
fibroblasts and tumor-associated macrophages in promoting pancreatic
cancer progression through paracrine signaling. This condition-specific
analysis highlights shifts in communication networks during tumor
progression and metastasis, identifying fibroblast–ductal and
myeloid–ductal interactions as potential contributors to tumor–stroma
crosstalk.
Citation: Hosein, A. N., Huang, H., Wang, Z., Parmar, K., Du, W., Huang, J., Maitra, A., Olson, E., Verma, U., & Brekken, R. A. (2019). Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution. JCI Insight, 4(16). https://doi.org/10.1172/jci.insight.129212
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: AlmaLinux 8.10 (Cerulean Leopard)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS NETLIB; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] CellChat_1.6.1 bigmemory_4.6.4
## [3] igraph_2.1.4 SeuratWrappers_0.4.0
## [5] monocle3_1.3.7 patchwork_1.3.0
## [7] Matrix_1.7-0 celldex_1.13.3
## [9] SingleR_2.6.0 future_1.40.0
## [11] SeuratDisk_0.0.0.9021 scDblFinder_1.18.0
## [13] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [15] Biobase_2.64.0 GenomicRanges_1.56.0
## [17] GenomeInfoDb_1.40.0 IRanges_2.38.0
## [19] S4Vectors_0.42.0 BiocGenerics_0.50.0
## [21] MatrixGenerics_1.16.0 matrixStats_1.5.0
## [23] lubridate_1.9.3 forcats_1.0.0
## [25] stringr_1.5.1 dplyr_1.1.4
## [27] purrr_1.0.4 readr_2.1.5
## [29] tidyr_1.3.1 tibble_3.2.1
## [31] tidyverse_2.0.0 ggplot2_3.5.2
## [33] Seurat_5.3.0 SeuratObject_5.1.0
## [35] sp_2.2-0
##
## loaded via a namespace (and not attached):
## [1] R.methodsS3_1.8.2 dichromat_2.0-0.1
## [3] paws.storage_0.5.0 goftest_1.2-3
## [5] Biostrings_2.72.0 HDF5Array_1.32.0
## [7] vctrs_0.6.5 spatstat.random_3.3-3
## [9] shape_1.4.6.1 proxy_0.4-27
## [11] digest_0.6.37 png_0.1-8
## [13] registry_0.5-1 gypsum_1.0.0
## [15] ggrepel_0.9.6 deldir_2.0-4
## [17] parallelly_1.43.0 magick_2.8.3
## [19] MASS_7.3-60.2 reshape2_1.4.4
## [21] foreach_1.5.2 httpuv_1.6.16
## [23] withr_3.0.2 ggrastr_1.0.2
## [25] xfun_0.52 ggpubr_0.6.0
## [27] survival_3.6-4 memoise_2.0.1
## [29] ggbeeswarm_0.7.2 systemfonts_1.2.3
## [31] GlobalOptions_0.1.2 ragg_1.4.0
## [33] zoo_1.8-14 pbapply_1.7-2
## [35] R.oo_1.27.1 KEGGREST_1.44.0
## [37] promises_1.3.2 httr_1.4.7
## [39] rstatix_0.7.2 restfulr_0.0.15
## [41] globals_0.17.0 fitdistrplus_1.2-2
## [43] rhdf5filters_1.16.0 rhdf5_2.48.0
## [45] rstudioapi_0.16.0 UCSC.utils_1.0.0
## [47] miniUI_0.1.2 generics_0.1.3
## [49] ggalluvial_0.12.5 curl_6.2.2
## [51] zlibbioc_1.50.0 ScaledMatrix_1.12.0
## [53] polyclip_1.10-7 GenomeInfoDbData_1.2.12
## [55] ExperimentHub_2.12.0 SparseArray_1.4.0
## [57] doParallel_1.0.17 xtable_1.8-4
## [59] evaluate_1.0.3 S4Arrays_1.4.0
## [61] BiocFileCache_2.12.0 hms_1.1.3
## [63] irlba_2.3.5.1 colorspace_2.1-0
## [65] filelock_1.0.3 hdf5r_1.3.12
## [67] ggnetwork_0.5.13 ROCR_1.0-11
## [69] reticulate_1.42.0 spatstat.data_3.1-6
## [71] magrittr_2.0.3 lmtest_0.9-40
## [73] later_1.4.2 viridis_0.6.5
## [75] lattice_0.22-6 NMF_0.28
## [77] spatstat.geom_3.3-6 future.apply_1.11.3
## [79] scattermore_1.2 XML_3.99-0.18
## [81] scuttle_1.14.0 cowplot_1.1.3
## [83] RcppAnnoy_0.0.22 pillar_1.10.2
## [85] nlme_3.1-164 sna_2.7-2
## [87] iterators_1.0.14 gridBase_0.4-7
## [89] compiler_4.4.0 beachmat_2.20.0
## [91] RSpectra_0.16-2 stringi_1.8.7
## [93] tensor_1.5 minqa_1.2.8
## [95] GenomicAlignments_1.40.0 plyr_1.8.9
## [97] crayon_1.5.3 abind_1.4-8
## [99] BiocIO_1.14.0 scater_1.32.0
## [101] locfit_1.5-9.9 bit_4.6.0
## [103] codetools_0.2-20 textshaping_1.0.1
## [105] BiocSingular_1.20.0 bslib_0.9.0
## [107] alabaster.ranges_1.4.0 GetoptLong_1.0.5
## [109] plotly_4.10.4 leidenbase_0.1.35
## [111] mime_0.13 splines_4.4.0
## [113] circlize_0.4.16 paws.common_0.7.2
## [115] Rcpp_1.0.14 fastDummies_1.7.5
## [117] dbplyr_2.5.0 sparseMatrixStats_1.16.0
## [119] knitr_1.50 blob_1.2.4
## [121] utf8_1.2.5 clue_0.3-65
## [123] BiocVersion_3.19.1 lme4_1.1-37
## [125] listenv_0.9.1 DelayedMatrixStats_1.26.0
## [127] Rdpack_2.6.4 ggsignif_0.6.4
## [129] statmod_1.5.0 svglite_2.1.3
## [131] tzdb_0.5.0 network_1.18.2
## [133] pkgconfig_2.0.3 tools_4.4.0
## [135] cachem_1.1.0 rbibutils_2.3
## [137] RSQLite_2.3.11 viridisLite_0.4.2
## [139] DBI_1.2.3 fastmap_1.2.0
## [141] rmarkdown_2.29 scales_1.4.0
## [143] grid_4.4.0 ica_1.0-3
## [145] Rsamtools_2.20.0 broom_1.0.5
## [147] AnnotationHub_3.12.0 sass_0.4.10
## [149] coda_0.19-4.1 FNN_1.1.4.1
## [151] BiocManager_1.30.25 dotCall64_1.2
## [153] carData_3.0-5 RANN_2.6.2
## [155] alabaster.schemas_1.4.0 farver_2.1.2
## [157] reformulas_0.4.1 yaml_2.3.10
## [159] rtracklayer_1.64.0 cli_3.6.5
## [161] lifecycle_1.0.4 uwot_0.2.3
## [163] backports_1.4.1 presto_1.0.0
## [165] bluster_1.14.0 BiocParallel_1.38.0
## [167] timechange_0.3.0 gtable_0.3.6
## [169] rjson_0.2.23 ggridges_0.5.6
## [171] progressr_0.15.1 parallel_4.4.0
## [173] limma_3.60.0 jsonlite_2.0.0
## [175] edgeR_4.2.0 RcppHNSW_0.6.0
## [177] bitops_1.0-9 bigmemory.sri_0.1.8
## [179] bit64_4.6.0-1 assertthat_0.2.1
## [181] xgboost_1.7.7.1 Rtsne_0.17
## [183] alabaster.matrix_1.4.0 spatstat.utils_3.1-3
## [185] BiocNeighbors_1.22.0 jquerylib_0.1.4
## [187] metapod_1.12.0 alabaster.se_1.4.0
## [189] dqrng_0.4.1 spatstat.univar_3.1-2
## [191] R.utils_2.13.0 lazyeval_0.2.2
## [193] alabaster.base_1.4.0 shiny_1.10.0
## [195] htmltools_0.5.8.1 sctransform_0.4.2
## [197] rappdirs_0.3.3 glue_1.8.0
## [199] spam_2.11-1 httr2_1.0.1
## [201] XVector_0.44.0 RCurl_1.98-1.17
## [203] scran_1.32.0 gridExtra_2.3
## [205] boot_1.3-30 R6_2.6.1
## [207] labeling_0.4.3 rngtools_1.5.2
## [209] cluster_2.1.6 Rhdf5lib_1.26.0
## [211] statnet.common_4.9.0 nloptr_2.2.1
## [213] DelayedArray_0.30.0 tidyselect_1.2.1
## [215] vipor_0.4.7 car_3.1-2
## [217] AnnotationDbi_1.66.0 rsvd_1.0.5
## [219] KernSmooth_2.23-22 data.table_1.17.0
## [221] ComplexHeatmap_2.20.0 htmlwidgets_1.6.4
## [223] RColorBrewer_1.1-3 rlang_1.1.6
## [225] spatstat.sparse_3.1-0 spatstat.explore_3.4-2
## [227] uuid_1.2-1 remotes_2.5.0
## [229] Cairo_1.6-2 beeswarm_0.4.0